<!DOCTYPE html>
<html lang="en-us">
  <head>

    <meta http-equiv="content-type" content="text/html; charset=utf-8">
    
<meta charset="UTF-8">
<title>Ngrams for Compound Words | Elasticsearch: The Definitive Guide [2.x] | Elastic</title>
<link rel="home" href="index.html" title="Elasticsearch: The Definitive Guide [2.x]">
<link rel="up" href="partial-matching.html" title="Partial Matching">
<link rel="prev" href="_index_time_search_as_you_type.html" title="Index-Time Search-as-You-Type">
<link rel="next" href="controlling-relevance.html" title="Controlling Relevance">
<meta name="DC.type" content="Learn/Docs/Legacy/Elasticsearch/Definitive Guide/2.x">
<meta name="DC.subject" content="Elasticsearch">
<meta name="DC.identifier" content="2.x">
<meta name="robots" content="noindex,nofollow">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <script src="https://cdn.optimizely.com/js/18132920325.js"></script>
    <link rel="apple-touch-icon" sizes="57x57" href="/apple-icon-57x57.png">
    <link rel="apple-touch-icon" sizes="60x60" href="/apple-icon-60x60.png">
    <link rel="apple-touch-icon" sizes="72x72" href="/apple-icon-72x72.png">
    <link rel="apple-touch-icon" sizes="76x76" href="/apple-icon-76x76.png">
    <link rel="apple-touch-icon" sizes="114x114" href="/apple-icon-114x114.png">
    <link rel="apple-touch-icon" sizes="120x120" href="/apple-icon-120x120.png">
    <link rel="apple-touch-icon" sizes="144x144" href="/apple-icon-144x144.png">
    <link rel="apple-touch-icon" sizes="152x152" href="/apple-icon-152x152.png">
    <link rel="apple-touch-icon" sizes="180x180" href="/apple-icon-180x180.png">
    <link rel="icon" type="image/png" href="/favicon-32x32.png" sizes="32x32">
    <link rel="icon" type="image/png" href="/android-chrome-192x192.png" sizes="192x192">
    <link rel="icon" type="image/png" href="/favicon-96x96.png" sizes="96x96">
    <link rel="icon" type="image/png" href="/favicon-16x16.png" sizes="16x16">
    <link rel="manifest" href="/manifest.json">
    <meta name="apple-mobile-web-app-title" content="Elastic">
    <meta name="application-name" content="Elastic">
    <meta name="msapplication-TileColor" content="#ffffff">
    <meta name="msapplication-TileImage" content="/mstile-144x144.png">
    <meta name="theme-color" content="#ffffff">
    <meta name="naver-site-verification" content="936882c1853b701b3cef3721758d80535413dbfd">
    <meta name="yandex-verification" content="d8a47e95d0972434">
    <meta name="localized" content="true">
    <meta name="st:robots" content="follow,index">
    <meta property="og:image" content="https://www.elastic.co/static/images/elastic-logo-200.png">
    <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">
    <link rel="icon" href="/favicon.ico" type="image/x-icon">
    <link rel="apple-touch-icon-precomposed" sizes="64x64" href="/favicon_64x64_16bit.png">
    <link rel="apple-touch-icon-precomposed" sizes="32x32" href="/favicon_32x32.png">
    <link rel="apple-touch-icon-precomposed" sizes="16x16" href="/favicon_16x16.png">
    <!-- Give IE8 a fighting chance -->
    <!--[if lt IE 9]>
    <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
    <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
    <![endif]-->
    <link rel="stylesheet" type="text/css" href="/guide/static/styles.css">
  </head>

  <!--© 2015-2021 Elasticsearch B.V. Copying, publishing and/or distributing without written permission is strictly prohibited.-->

  <body>
    <!-- Google Tag Manager -->
    <script>dataLayer = [];</script><noscript><iframe src="//www.googletagmanager.com/ns.html?id=GTM-58RLH5" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
    <script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start': new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0], j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= '//www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f); })(window,document,'script','dataLayer','GTM-58RLH5');</script>
    <!-- End Google Tag Manager -->

    <!-- Global site tag (gtag.js) - Google Analytics -->
    <script async src="https://www.googletagmanager.com/gtag/js?id=UA-12395217-16"></script>
    <script>
      window.dataLayer = window.dataLayer || [];
      function gtag(){dataLayer.push(arguments);}
      gtag('js', new Date());
      gtag('config', 'UA-12395217-16');
    </script>

    <!--BEGIN QUALTRICS WEBSITE FEEDBACK SNIPPET-->
    <script type="text/javascript">
      (function(){var g=function(e,h,f,g){
      this.get=function(a){for(var a=a+"=",c=document.cookie.split(";"),b=0,e=c.length;b<e;b++){for(var d=c[b];" "==d.charAt(0);)d=d.substring(1,d.length);if(0==d.indexOf(a))return d.substring(a.length,d.length)}return null};
      this.set=function(a,c){var b="",b=new Date;b.setTime(b.getTime()+6048E5);b="; expires="+b.toGMTString();document.cookie=a+"="+c+b+"; path=/; "};
      this.check=function(){var a=this.get(f);if(a)a=a.split(":");else if(100!=e)"v"==h&&(e=Math.random()>=e/100?0:100),a=[h,e,0],this.set(f,a.join(":"));else return!0;var c=a[1];if(100==c)return!0;switch(a[0]){case "v":return!1;case "r":return c=a[2]%Math.floor(100/c),a[2]++,this.set(f,a.join(":")),!c}return!0};
      this.go=function(){if(this.check()){var a=document.createElement("script");a.type="text/javascript";a.src=g;document.body&&document.body.appendChild(a)}};
      this.start=function(){var a=this;window.addEventListener?window.addEventListener("load",function(){a.go()},!1):window.attachEvent&&window.attachEvent("onload",function(){a.go()})}};
      try{(new g(100,"r","QSI_S_ZN_emkP0oSe9Qrn7kF","https://znemkp0ose9qrn7kf-elastic.siteintercept.qualtrics.com/WRSiteInterceptEngine/?Q_ZID=ZN_emkP0oSe9Qrn7kF")).start()}catch(i){}})();
    </script><div id="ZN_emkP0oSe9Qrn7kF"><!--DO NOT REMOVE-CONTENTS PLACED HERE--></div>
    <!--END WEBSITE FEEDBACK SNIPPET-->

    <div id="elastic-nav" style="display:none;"></div>
    <script src="https://www.elastic.co/elastic-nav.js"></script>

    <!-- Subnav -->
    <div>
      <div>
        <div class="tertiary-nav d-none d-md-block">
          <div class="container">
            <div class="p-t-b-15 d-flex justify-content-between nav-container">
              <div class="breadcrum-wrapper"><span><a href="/guide/" style="font-size: 14px; font-weight: 600; color: #000;">Docs</a></span></div>
            </div>
          </div>
        </div>
      </div>
    </div>

    <div class="main-container">
      <section id="content">
        <div class="content-wrapper">

          <section id="guide" lang="en">
            <div class="container">
              <div class="row">
                <div class="col-xs-12 col-sm-8 col-md-8 guide-section">
                  <!-- start body -->
                  <div class="page_header">
<p>
  <strong>WARNING</strong>: The 2.x versions of Elasticsearch have passed their
  <a href="https://www.elastic.co/support/eol">EOL dates</a>. If you are running
  a 2.x version, we strongly advise you to upgrade.
</p>
<p>
  This documentation is no longer maintained and may be removed. For the latest
  information, see the <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html">current
  Elasticsearch documentation</a>.
</p>
</div>
<div id="content">
<div class="breadcrumbs">
<span class="breadcrumb-link"><a href="index.html">Elasticsearch: The Definitive Guide [2.x]</a></span>
»
<span class="breadcrumb-link"><a href="search-in-depth.html">Search in Depth</a></span>
»
<span class="breadcrumb-link"><a href="partial-matching.html">Partial Matching</a></span>
»
<span class="breadcrumb-node">Ngrams for Compound Words</span>
</div>
<div class="navheader">
<span class="prev">
<a href="_index_time_search_as_you_type.html">« Index-Time Search-as-You-Type</a>
</span>
<span class="next">
<a href="controlling-relevance.html">Controlling Relevance »</a>
</span>
</div>
<div class="section">
<div class="titlepage"><div><div>
<h2 class="title">
<a id="ngrams-compound-words"></a>Ngrams for Compound Words<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/130_Partial_Matching/40_Compound_words.asciidoc">edit</a>
</h2>
</div></div></div>
<p>Finally, let’s take a look at how n-grams can be used to search languages with
compound words.  German is famous for combining several small words into one
massive compound word in order to capture precise or complex meanings. For
example:</p>
<div class="variablelist">
<dl class="variablelist">
<dt>
<span class="term">
<em>Aussprachewörterbuch</em>
</span>
</dt>
<dd>
Pronunciation dictionary
</dd>
<dt>
<span class="term">
<em>Militärgeschichte</em>
</span>
</dt>
<dd>
Military history
</dd>
<dt>
<span class="term">
<em>Weißkopfseeadler</em>
</span>
</dt>
<dd>
White-headed sea eagle, or bald eagle
</dd>
<dt>
<span class="term">
<em>Weltgesundheitsorganisation</em>
</span>
</dt>
<dd>
World Health Organization
</dd>
<dt>
<span class="term">
<em>Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz</em>
</span>
</dt>
<dd>
The law concerning the delegation of duties for the supervision of cattle
marking and the labeling of beef
</dd>
</dl>
</div>
<p>Somebody searching for “Wörterbuch” (dictionary) would probably expect to
see “Aussprachewörtebuch” in the results list. Similarly, a search for
“Adler” (eagle) should include “Weißkopfseeadler.”</p>
<p>One approach to indexing languages like this is to break compound words into
their constituent parts using the <a href="/guide/en/elasticsearch/reference/2.4/analysis-compound-word-tokenfilter.html" class="ulink" target="_top">compound word token filter</a>.
However, the quality of the results depends on how good your compound-word
dictionary is.</p>
<p>Another approach is just to break all words into n-grams and to search for any
matching fragments—​the more fragments that match, the more relevant the
document.</p>
<p>Given that an n-gram is a moving window on a word, an n-gram of any length
will cover all of the word.  We want to choose a length that is long enough
to be meaningful, but not so long that we produce far too many unique terms.
A <em>trigram</em> (length 3) is probably a good starting point:</p>
<div class="pre_wrapper lang-sense">
<pre class="programlisting prettyprint lang-sense">PUT /my_index
{
    "settings": {
        "analysis": {
            "filter": {
                "trigrams_filter": {
                    "type":     "ngram",
                    "min_gram": 3,
                    "max_gram": 3
                }
            },
            "analyzer": {
                "trigrams": {
                    "type":      "custom",
                    "tokenizer": "standard",
                    "filter":   [
                        "lowercase",
                        "trigrams_filter"
                    ]
                }
            }
        }
    },
    "mappings": {
        "my_type": {
            "properties": {
                "text": {
                    "type":     "string",
                    "analyzer": "trigrams" <a id="CO100-1"></a><i class="conum" data-value="1"></i>
                }
            }
        }
    }
}</pre>
</div>
<div class="sense_widget" data-snippet="snippets/130_Partial_Matching/40_Compound_words.json"></div>
<div class="calloutlist">
<table border="0" summary="Callout list">
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO100-1"><i class="conum" data-value="1"></i></a></p>
</td>
<td align="left" valign="top">
<p>The <code class="literal">text</code> field uses the <code class="literal">trigrams</code> analyzer to index its contents as
n-grams of length 3.</p>
</td>
</tr>
</table>
</div>
<p>Testing the trigrams analyzer with the <code class="literal">analyze</code> API</p>
<div class="pre_wrapper lang-sense">
<pre class="programlisting prettyprint lang-sense">GET /my_index/_analyze?analyzer=trigrams
Weißkopfseeadler</pre>
</div>
<div class="sense_widget" data-snippet="snippets/130_Partial_Matching/40_Compound_words.json"></div>
<p>returns these terms:</p>
<pre class="literallayout">wei, eiß, ißk, ßko, kop, opf, pfs, fse, see, eea,ead, adl, dle, ler</pre>

<p>We can index our example compound words to test this approach:</p>
<div class="pre_wrapper lang-sense">
<pre class="programlisting prettyprint lang-sense">POST /my_index/my_type/_bulk
{ "index": { "_id": 1 }}
{ "text": "Aussprachewörterbuch" }
{ "index": { "_id": 2 }}
{ "text": "Militärgeschichte" }
{ "index": { "_id": 3 }}
{ "text": "Weißkopfseeadler" }
{ "index": { "_id": 4 }}
{ "text": "Weltgesundheitsorganisation" }
{ "index": { "_id": 5 }}
{ "text": "Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz" }</pre>
</div>
<div class="sense_widget" data-snippet="snippets/130_Partial_Matching/40_Compound_words.json"></div>
<p>A search for “Adler” (eagle) becomes a query for the three terms <code class="literal">adl</code>, <code class="literal">dle</code>,
and <code class="literal">ler</code>:</p>
<div class="pre_wrapper lang-sense">
<pre class="programlisting prettyprint lang-sense">GET /my_index/my_type/_search
{
    "query": {
        "match": {
            "text": "Adler"
        }
    }
}</pre>
</div>
<div class="sense_widget" data-snippet="snippets/130_Partial_Matching/40_Compound_words.json"></div>
<p>which correctly matches “Weißkopfsee-<em>adler</em>”:</p>
<div class="pre_wrapper lang-sense">
<pre class="programlisting prettyprint lang-sense">{
  "hits": [
     {
        "_id": "3",
        "_score": 3.3191128,
        "_source": {
           "text": "Weißkopfseeadler"
        }
     }
  ]
}</pre>
</div>
<div class="sense_widget" data-snippet="snippets/130_Partial_Matching/40_Compound_words.json"></div>
<p>A similar query for “Gesundheit” (health) correctly matches
“Welt-<em>gesundheit</em>-sorganisation,” but it also matches
“Militär-<em>ges</em>-chichte” and
“Rindfleischetikettierungsüberwachungsaufgabenübertragungs-<em>ges</em>-etz,”
both of which also contain the trigram <code class="literal">ges</code>.</p>
<p>Judicious use of the <code class="literal">minimum_should_match</code> parameter can remove these
spurious results by requiring that a minimum number of trigrams must be
present for a document to be considered a match:</p>
<div class="pre_wrapper lang-sense">
<pre class="programlisting prettyprint lang-sense">GET /my_index/my_type/_search
{
    "query": {
        "match": {
            "text": {
                "query":                "Gesundheit",
                "minimum_should_match": "80%"
            }
        }
    }
}</pre>
</div>
<div class="sense_widget" data-snippet="snippets/130_Partial_Matching/40_Compound_words.json"></div>
<p>This is a bit of a shotgun approach to full-text search and can result in a
large inverted index, but it is an effective generic way of indexing languages
that use many compound words or that don’t use whitespace between words,
such as Thai.</p>
<p>This technique is used to increase <em>recall</em>—the number of relevant
documents that a search returns.  It is usually used in combination with
other techniques, such as shingles (see <a class="xref" href="shingles.html" title="Finding Associated Words">Finding Associated Words</a>) to improve precision and
the relevance score of each document.</p>
</div>
<div class="navfooter">
<span class="prev">
<a href="_index_time_search_as_you_type.html">« Index-Time Search-as-You-Type</a>
</span>
<span class="next">
<a href="controlling-relevance.html">Controlling Relevance »</a>
</span>
</div>
</div>

                  <!-- end body -->
                </div>
                <div class="col-xs-12 col-sm-4 col-md-4" id="right_col">
                  <div id="rtpcontainer" style="display: block;">
                    <div class="mktg-promo">
                      <h3>Most Popular</h3>
                      <ul class="icons">
                        <li class="icon-elasticsearch-white"><a href="https://www.elastic.co/webinars/getting-started-elasticsearch?baymax=default&amp;elektra=docs&amp;storm=top-video">Get Started with Elasticsearch: Video</a></li>
                        <li class="icon-kibana-white"><a href="https://www.elastic.co/webinars/getting-started-kibana?baymax=default&amp;elektra=docs&amp;storm=top-video">Intro to Kibana: Video</a></li>
                        <li class="icon-logstash-white"><a href="https://www.elastic.co/webinars/introduction-elk-stack?baymax=default&amp;elektra=docs&amp;storm=top-video">ELK for Logs &amp; Metrics: Video</a></li>
                      </ul>
                    </div>
                  </div>
                </div>
              </div>
            </div>
          </section>

        </div>


<div id="elastic-footer"></div>
<script src="https://www.elastic.co/elastic-footer.js"></script>
<!-- Footer Section end-->

      </section>
    </div>

<script src="/guide/static/jquery.js"></script>
<script type="text/javascript" src="/guide/static/docs.js"></script>
<script type="text/javascript">
  window.initial_state = {}</script>
  </body>
</html>
